###############################################
### Project: Communicating in an eventful   ###
###          campaign                       ###
### Task:    Analysis - Preparation         ###
### Title:   06_Analysis_Preparation.R      ###
###############################################

#--------------------------------------------------------------------------------------------------
# Description:
#
# This script creates the main data set used for the statistical analysis. The following lines of
# code produce the dependent and independent variables for the estimated regression models in the
# paper.
#--------------------------------------------------------------------------------------------------

#---------------------------------------------------------------------------------------------------------------------

# Preparation

## load packages
library(tidyverse)
library(manifestoR)

## load text data (press releases)
## after import, the mis-encoded party label is replaced by "Grüne" and the
## underscores in the issue labels are replaced by blanks
data_text_pr <- read.csv("Classified_Pressreleases_20210401_20210926.csv", encoding = "UTF-8") %>%
  mutate(party = gsub("GrÃ¼ne", "Grüne", party),
         issue = gsub("_", " ", issue))

## load text data (manifestos)
data_text_man <- readRDS(file = "Corpus_Manifestos_DE_2021.RDS")

## load GLES data
data_gles <- readRDS(file = "GLES_Data.RDS")

## load poll data
data_poll <- readRDS(file = "Poll_Data.RDS")

#---------------------------------------------------------------------------------------------------------------------

# Dependent variables (press release salience and volume)

## filter raw data
## keep policy-related press releases with a coded issue, dated 2021-06-01 or
## later; only the three key columns are needed below and they are selected by
## name rather than by column position
df_dv <- data_text_pr %>%
  filter(!is.na(issue), type == "policy-related", date >= "2021-06-01") %>%
  select(party, issue, month)

## calculate press release salience per party and month (June, July, August and September)
## salience = share (in %) of a party's press releases in a month that deal
## with the issue
df_dv1 <- df_dv %>%
  group_by(party, issue, month) %>%
  mutate(N = n()) %>%
  group_by(party, month) %>%
  mutate(pressrelease_salience = (N / n()) * 100) %>%
  ungroup() %>%
  distinct(party, issue, month, pressrelease_salience)

## calculate press release volume per party and month (June, July, August and September)
## volume = number of press releases per party, issue and month
## (stored as double so that the 0-fill below does not change the type)
df_dv2 <- df_dv %>%
  group_by(party, issue, month) %>%
  mutate(pressrelease_volume = as.numeric(n())) %>%
  ungroup() %>%
  distinct(party, issue, month, pressrelease_volume)

## data frame with dependent variables
## full party x issue x month grid; expand.grid() varies its first argument
## fastest, so rows run issue within party within month (June first)
df_dv <- expand.grid(issue = sort(unique(df_dv1$issue)),
                     party = unique(df_dv1$party),
                     month = c("June", "July", "August", "September"),
                     KEEP.OUT.ATTRS = FALSE,
                     stringsAsFactors = FALSE) %>%
  select(party, issue, month) %>%
  left_join(df_dv1, by = c("party", "issue", "month")) %>%
  left_join(df_dv2, by = c("party", "issue", "month"))

## grid cells without any press release get a salience and volume of 0
df_dv$pressrelease_salience[is.na(df_dv$pressrelease_salience)] <- 0
df_dv$pressrelease_volume[is.na(df_dv$pressrelease_volume)] <- 0

#---------------------------------------------------------------------------------------------------------------------

# Independent variables

## IV 1: manifesto salience
## share (in %) of a party's manifesto rows that are coded with the issue;
## columns are addressed by name rather than by position
## rows whose issue is missing or the string "NA" still count towards the
## party total, but are removed from the result by the final filter
df_iv1 <- data_text_man %>%
  select(party, issue) %>%
  group_by(party, issue) %>%
  mutate(N = n()) %>%
  group_by(party) %>%
  mutate(manifesto_salience = (N / n()) * 100) %>%
  ungroup() %>%
  distinct(party, issue, manifesto_salience) %>%
  filter(issue != "NA")

## IV 2: public salience

### filter GLES data
### drop responses labelled as error codes and responses without a label
data_gles <- filter(data_gles, label != "Error_Codes" & !is.na(label))

### helper: public salience of each label in one survey month
### gles          GLES responses with the columns `label` and `end_month`
### survey_month  value of `end_month` that selects the responses
### target_month  month under which the result is stored (defaults to the
###               survey month)
### returns a data frame with the columns issue, month and public_salience,
### where public_salience is the share (in %) of the month's responses that
### carry the label
public_salience_by_month <- function(gles, survey_month, target_month = survey_month) {
  gles_month <- filter(gles, end_month == survey_month)
  shares <- (table(gles_month$label) / nrow(gles_month)) * 100
  data.frame(issue = as.character(names(shares)),
             month = rep(target_month, length(shares)),
             public_salience = as.numeric(shares))
}

### create data set for public salience
df_iv2 <- rbind(
  # no survey in June, therefore May as proxy for June
  public_salience_by_month(data_gles, "May", target_month = "June"),
  public_salience_by_month(data_gles, "July"),
  public_salience_by_month(data_gles, "August"),
  public_salience_by_month(data_gles, "September")
)
df_iv2$issue <- gsub("_", " ", df_iv2$issue)

## IV 3: competitor salience

### helper: issue shares among the policy-related press releases that all
### other parties published in one month
### pr           press release data with the columns party, month, type, issue
### focal_party  party whose competitors are evaluated
### focal_month  month to evaluate
### returns a data frame with the columns party, month, issue and
### competitor_salience (in % of the competitors' press releases of the month)
competitor_salience_for <- function(pr, focal_party, focal_month) {
  pr_comp <- filter(pr, party != focal_party & month == focal_month & type == "policy-related")
  res <- data.frame(party = focal_party,
                    month = focal_month,
                    table(pr_comp$issue) / nrow(pr_comp) * 100)
  colnames(res)[3:4] <- c("issue", "competitor_salience")
  res
}

### get competitor salience per party and month
### sort() drops a missing party label, so the party vector itself is iterated
### instead of an index that runs up to length(unique(...))
comp_parties <- sort(unique(data_text_pr$party))
comp_months <- c("June", "July", "August", "September")

### collect all party-month results first and bind them once, instead of
### growing the data frame inside a loop
df_iv3 <- do.call(rbind, lapply(comp_parties, function(p) {
  do.call(rbind, lapply(comp_months, function(m) {
    competitor_salience_for(data_text_pr, p, m)
  }))
}))
df_iv3$issue <- gsub("_", " ", df_iv3$issue)

### merge DV and IV data sets
### the join keys are spelled out so that the merge does not depend on
### whichever column names the inputs happen to share
data <- df_dv %>%
  left_join(df_iv1, by = c("party", "issue")) %>%
  left_join(df_iv2, by = c("issue", "month")) %>%
  left_join(df_iv3, by = c("party", "issue", "month"))

### rows without a match in the competitor data get a competitor salience of 0
data$competitor_salience[is.na(data$competitor_salience)] <- 0

## IV 4: external event
## dummy that is 1 for the issue-months affected by an external event:
##   Foreign Affairs and Defense in August (afghanistan)
##   Environment in July (flood)
data$external_event <- as.numeric(
  (data$issue %in% c("Foreign Affairs", "Defense") & data$month == "August") |
    (data$issue == "Environment" & data$month == "July")
)

## IV 5: polls

data <- left_join(data, data_poll, by = c("party", "month"))

#---------------------------------------------------------------------------------------------------------------------

# Control variables

## government participation
## 1 for CDU/CSU and SPD, 0 for all other parties
data$government <- as.numeric(data$party %in% c("CDU/CSU", "SPD"))

## Left-right position
## NOTE(review): the Manifesto Project API key is hard-coded in this script;
## consider supplying it from outside the source file (e.g. an environment
## variable or manifestoR::mp_setapikey() with a key file).
## The argument is written out as `apikey` instead of relying on partial
## matching of `api`.
mp_data <- mp_maindataset(apikey = "06f2fbe770a02e83230a8b17fd65f373") %>%
  filter(countryname == "Germany", edate >= as.Date("2017-01-01")) %>%
  group_by(partyabbrev) %>%
  # seat share in % at each election
  mutate(perseat = (absseat / totseats * 100)) %>%
  # shift vote share, seats and seat share by one election within each party,
  # so that the latest election row carries the values of the election before
  # it; order_by makes the lag independent of the row order of the download
  mutate(pervote = dplyr::lag(pervote, n = 1, order_by = edate),
         absseat = dplyr::lag(absseat, n = 1, order_by = edate),
         perseat = dplyr::lag(perseat, n = 1, order_by = edate)) %>%
  ungroup() %>%
  filter(edate >= as.Date("2021-09-21")) %>%
  select(party = partyabbrev, edate, rile, pervote, absseat, perseat) %>%
  # harmonise party labels with the labels used in the press release data
  mutate(party = gsub("90/Greens", "Grüne", party, fixed = TRUE),
         party = gsub("LINKE", "Linke", party, fixed = TRUE))

## natural join on the columns shared by both data sets
## (presumably only `party`; verify against the columns of the poll data)
data <- left_join(data, mp_data)

#---------------------------------------------------------------------------------------------------------------------

# Lagged variables

## one-month lag within each party-issue series
## the lag is ordered explicitly by calendar month, so it does not depend on
## the row order of the data set; the first month of each series gets NA
month_order <- c("June", "July", "August", "September")
lag_by_month <- function(x, month) {
  dplyr::lag(x, n = 1, order_by = match(month, month_order))
}

data <- data %>%
  group_by(party, issue) %>%
  mutate(pressrelease_salience_lagged = lag_by_month(pressrelease_salience, month),
         pressrelease_volume_lagged = lag_by_month(pressrelease_volume, month),
         public_salience_lagged = lag_by_month(public_salience, month),
         competitor_salience_lagged = lag_by_month(competitor_salience, month),
         external_event_lagged = lag_by_month(external_event, month),
         poll_trend_lagged = lag_by_month(poll_trend, month)) %>%
  # drop the grouping so that later steps and the saved file are not grouped
  ungroup()

#---------------------------------------------------------------------------------------------------------------------

# Filter data set

## drop the June rows and turn month into a factor with July as reference level
data <- data %>%
  filter(month != "June")
data$month <- relevel(factor(as.character(data$month)), ref = "July")

#---------------------------------------------------------------------------------------------------------------------

# Save final data set

saveRDS(object = data, file = "Analysis_Data.RDS")
